# PREPARE
library(phyloseq)
library(ggpubr) # a handy helper package for ggplots
theme_set(theme_bw()) # setting the theme for ggplots
library(microbiome)
library(tidyverse)
library(DT)
`%notin%` <- Negate(`%in%`)
# MICROBIAL DATA
# reading in a previously saved phyloseq object. This contains microbial 16S-ASV abundances of all samples.
ps_1C <- readRDS("./data/ps_215samplesJul24")
# ps_1C
#phyloseq-class experiment-level object
#otu_table() OTU Table: [ 17961 taxa and 215 samples ]
#sample_data() Sample Data: [ 215 samples by 48 sample variables ]
#tax_table() Taxonomy Table: [ 17961 taxa by 7 taxonomic ranks ]
#phy_tree() Phylogenetic Tree: [ 17961 tips and 17822 internal nodes ]
### Pre-filtering microbial data
ps.flt <- prune_taxa(taxa_sums(ps_1C) >= 10, ps_1C) #m inimum reads per ASV
#phyloseq-class experiment-level object
#otu_table() OTU Table: [ 5855 taxa and 215 samples ]
#sample_data() Sample Data: [ 215 samples by 60 sample variables ]
#tax_table() Taxonomy Table: [ 5855 taxa by 7 taxonomic ranks ]
#phy_tree() Phylogenetic Tree: [ 5855 tips and 5763 internal nodes ]
# MASTERDATA
# change this path to where you store the .csv file. you can provide an absolute path like this. Check the path syntax for Windows as it might be different.
masterdata <- read.csv("./data/masterdataProject1C_May24.csv")
## Change date to date format in R
masterdata <- masterdata %>%
mutate(Date = lubridate::dmy(as.character(Date)) )
# Make AD, Treatment and SludgeType = factors
masterdata$AD <- factor(masterdata$AD, levels = c("AD1", "AD2","AD3", "AD4","AD5", "AD6", "Full-scale", "PSTWAS"))
masterdata$SludgeType <- factor(masterdata$SludgeType, levels = c("Control", "Treatment", "Foam", "Full-scale", "PSTWAS"))
masterdata$Treatment <- factor(masterdata$Treatment, levels = c("Control", "Treatment", "Full-scale", "PSTWAS"))
masterdata$Period <- factor(masterdata$Period, levels = c("Converging", "SteadyState", "Glycerol", "Inhibition", "Recovery/Feedingpause", "Recovery/Foaming", "Recovery/Postfoam", "SteadyState/Postfoam"))
# LOAD REFLECTANCE DATA
ATR <- read.csv("./data/ADVATR.csv")[-1,]
ATR <- ATR %>% rownames_to_column("ID")%>% column_to_rownames("X") %>% dplyr::select(-ID) %>%
mutate(across(S537_AD1:S1370_AD6, as.numeric))
# OPTION 1: raw
ATR <- as.matrix(ATR)
head(ATR) %>% datatable(caption = "ATR raw")
# OPTION 2: clr transformed
ATRclr <- compositions::clr(ATR)
head(ATRclr) %>% datatable(caption = "ATR clr")
# CREATE METADATA FOR 24 SELECTED SAMPLES AND MATCH SAMPLE NAMES WITH SPECTRA DF
filtervec <- c("2023-05-01", "2023-05-15", "2023-06-07","2023-06-12") # 12thJuneDNA = 13th June EPS
filtervec2 <- c("AD1", "AD2", "AD3", "AD4", "AD5","AD6") # check
metadata <- masterdata %>%
dplyr::filter(Date %in% filtervec &
AD %in% filtervec2)
metadata$SampleID.EPS <- colnames(ATR)
metadata <- metadata %>% column_to_rownames("SampleID.EPS")
head(metadata ) %>% datatable(caption = "metadata for 24 selected samples")
# CREATE A PHYLOSEQ OBJECTS (combined metadata and spectra)
psATR <-phyloseq(
otu_table(ATR, taxa_are_rows = T),
sample_data(metadata)
)
psATRclr <-phyloseq(
otu_table(ATRclr, taxa_are_rows = T),
sample_data(metadata)
)
# ABSORBANCE DATA
AVE <- read.csv("./data/AVE.csv")[-1,]
AVE <- AVE %>% rownames_to_column("ID")%>% column_to_rownames("X") %>% dplyr::select(-ID) %>%
mutate(across(S537_AD1:S1370_AD6, as.numeric))
# OPTION 1: raw
AVE <- as.matrix(AVE)
# OPTION 2: clr transformed
AVEclr <- compositions::clr(AVE)
# head(AVEclr)
# CREATE A PHYLOSEQ OBJECTS (combined metadata and spectra)
psAVE <-phyloseq(
otu_table(AVE, taxa_are_rows = T),
sample_data(metadata)
)
psAVEclr <-phyloseq(
otu_table(AVEclr, taxa_are_rows = T),
sample_data(metadata)
)
Just to see how microbial 16S abundances vary among the selected
samples. We expect treated/inhibited samples (15th May) to be different.
library(factoextra)
library(compositions)
my_comparisons <- list(c(1, 2))
symbolsize <- 3
ps.temp <- psATR
pca <- prcomp(otu_table(ps.temp), scale = FALSE, center = TRUE)
# summary(otu_table(ps.temp))
# head(otu_table(ps.temp))
data <- get_pca(pca)
cord <- data$coord # extract sample coordinates of PC
cor <- data$cor
cos2 <- data$cos2
contrib <- data$contrib # contributions of variables
# Combine
df.tmp <- (data.frame(cord) %>% rownames_to_column("ID")) %>% left_join(sample_data(ps.temp) %>% rownames_to_column("ID") )
# compare_means(Dim.1 ~ Treatment, method = "t.test", df.tmp) #0.315
# compare_means(Dim.2 ~ Treatment, method = "t.test", df.tmp) #0.908
# compare_means(Dim.3 ~ Treatment, method = "t.test", df.tmp) #0.0075 **
# compare_means(Dim.4 ~ Treatment, method = "t.test", df.tmp) #0.205
Assessing if there were significant differences in the variation of
Principal Components (PCs) between groups (control and treatment)
library(factoextra)
library(compositions)
ps.temp <- psAVE
# summary(otu_table(ps.temp))
pca <- prcomp(data.frame(otu_table(ps.temp)), scale = FALSE, center = TRUE)
# otu_table(ps.temp)
data <- get_pca(pca)
cord <- data$coord
cor <- data$cor
cos2 <- data$cos2
contrib <- data$contrib
df.tmp <- (data.frame(cord) %>% rownames_to_column("ID")) %>%
left_join(sample_data(ps.temp) %>% rownames_to_column("ID") )
# compare_means(Dim.1 ~ Treatment, method = "t.test", df.tmp) #0.133
# compare_means(Dim.2 ~ Treatment, method = "t.test", df.tmp) #0.397
# compare_means(Dim.3 ~ Treatment, method = "t.test", df.tmp) #0.026 *
# compare_means(Dim.4 ~ Treatment, method = "t.test", df.tmp) #0.502
Assessing if there were significant differences in the variation of Principal Components (PCs) between groups (control and treatment)
library(factoextra)
library(compositions)
ps.temp <- psATRclr
pca <- prcomp(otu_table(ps.temp), scale = FALSE, center = FALSE)
# summary(otu_table(ps.temp))
# otu_table(ps.temp)
data <- get_pca(pca)
cord <- data$coord
cor <- data$cor
cos2 <- data$cos2
contrib <- data$contrib
df.tmp <- (data.frame(cord) %>% rownames_to_column("ID")) %>% left_join(sample_data(ps.temp) %>% rownames_to_column("ID") )
# compare_means(Dim.1 ~ Treatment, method = "t.test", df.tmp) #0.281
# compare_means(Dim.2 ~ Treatment, method = "t.test", df.tmp) #0.627
# compare_means(Dim.3 ~ Treatment, method = "t.test", df.tmp) #0.00117 **
# compare_means(Dim.4 ~ Treatment, method = "t.test", df.tmp) # 0.518
Assessing if there were significant differences in the variation of Principal Components (PCs) between groups (control and treatment)
library(factoextra)
library(compositions)
my_comparisons <- list(c(1, 2))
ps.temp <- psAVEclr
# summary(otu_table(ps.temp))
pca <- prcomp(data.frame(otu_table(ps.temp)), scale = FALSE, center = FALSE) # FALSE FOR BOTH - CLR data should already be centred and scaled.
# otu_table(ps.temp)
data <- get_pca(pca)
cord <- data$coord
cor <- data$cor
cos2 <- data$cos2
contrib <- data$contrib
df.tmp <- (data.frame(cord) %>% rownames_to_column("ID")) %>% left_join(sample_data(ps.temp) %>% rownames_to_column("ID") )
# compare_means(Dim.1 ~ Treatment, method = "t.test", df.tmp) #0.172
# compare_means(Dim.2 ~ Treatment, method = "t.test", df.tmp) #0.708
# compare_means(Dim.3 ~ Treatment, method = "t.test", df.tmp) #0.0228 *
# compare_means(Dim.4 ~ Treatment, method = "t.test", df.tmp) #0.613
Assessing if there were significant differences in the variation of Principal Components (PCs) between groups (control and treatment)
Unsure why the clr-transformed ATR spectra resulted in such different PC3 contributions compared to clr-transformed AVE spectra.
This needs further assessments/regression etc as it was done without any consideration to data distribution / validity.
library(ggcorrplot)
# Compute a correlation matrix
df.cor <- df.tmp %>% filter(ID != "S963_AD6") %>% # filter this sample as it has NAs
dplyr::select(Dim.1, Dim.2, Dim.3, Dim.4, Dim.5, Gasflow_mL, C, H_pct, N_pct, CN, HC, Ethanol:Hexanoic.acid, ngmL_qb, DNA_mg_gCOD)
corr <- round(cor(df.cor, method = "spearman"), 1)
p <- ggcorrplot(corr)
p + ggtitle("AVE clr correlations to variables")
#ggsave(plot = p, "./Figures/PC3correlations_clr.png", height=18, width=18, units='cm', dpi=300)